Brief description of the data set and a summary of its attributes
Initial plan for data exploration
Actions taken for data cleaning and feature engineering
Key Findings and Insights, which synthesizes the results of Exploratory Data Analysis in an insightful and actionable manner
Formulating at least 3 hypotheses about this data
- Ho: µ Education_Attrition == µ Education_Not_Attrition
- Ha: µ Education_Attrition != µ Education_Not_Attrition
- Ho: µ Age_Attrition == µ Age_Not_Attrition
- Ha: µ Age_Attrition != µ Age_Not_Attrition
- Ho: µ Job_Satisfaction_Attrition == µ Job_Satisfaction_Not_Attrition
- Ha: µ Job_Satisfaction_Attrition != µ Job_Satisfaction_Not_Attrition
Conducting a formal significance test for one of the hypotheses and discussing the results
ss.kruskal(attrition_df['Education'], not_attrition_df['Education']) returned KruskalResult(statistic=1.3527640913093548, pvalue=0.2447954753326153)
pvalue > 0.05
There appears to be no statistically significant relationship between Attrition and Education; thus we fail to reject the null hypothesis.
Suggestions for next steps in analyzing this data
A paragraph that summarizes the quality of this data set and a request for additional data if needed
Education: 1 'Below College', 2 'College', 3 'Bachelor', 4 'Master', 5 'Doctor'
Environment_Satisfaction: 1 'Low', 2 'Medium', 3 'High', 4 'Very High'
Job_Involvement: 1 'Low', 2 'Medium', 3 'High', 4 'Very High'
Job_Satisfaction: 1 'Low', 2 'Medium', 3 'High', 4 'Very High'
Performance_Rating: 1 'Low', 2 'Good', 3 'Excellent', 4 'Outstanding'
Relationship_Satisfaction: 1 'Low', 2 'Medium', 3 'High', 4 'Very High'
Work_Life_Balance: 1 'Bad', 2 'Good', 3 'Better', 4 'Best'
Distance_From_Home: Measured in Kilometers
Stock_Option_Level: Job_Level Scale
Job_Level: 1 - 5 scale
Percent_Salary_Hike: Percentage increase compared to the previous year
import pandas as pd
pd.set_option('display.max_columns', None)
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import scipy.stats as ss
import sklearn as skl
from scipy import stats
# Load the raw data set from IBM_HR.csv and take a first look at it.
df = pd.read_csv('IBM_HR.csv')
print(df.shape)  # (number of rows, number of columns)
df.head()

# Per-column flag: does the column contain at least one missing value?
df.isna().any()

# Column dtypes / non-null counts, then summary statistics transposed so
# that each row of the table describes one column of df.
df.info()
df.describe().transpose()
# Remove four columns that the analysis below never references
# (presumably constants / row identifiers -- confirm against the raw data).
clean_df = df.drop(columns=['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber'])
clean_df.head()

# Encode the target as integers: Yes -> 1, No -> 0.
# The mapped column is assigned back explicitly. Calling
# `.replace(..., inplace=True)` on `clean_df.Attrition` is a chained in-place
# operation on a selected column, which pandas warns about and which does not
# update `clean_df` when Copy-on-Write is active.
# NOTE: any value other than 'Yes'/'No' becomes NaN here instead of being
# left untouched, so unexpected labels show up as missing values.
clean_df['Attrition'] = clean_df['Attrition'].map({'Yes': 1, 'No': 0})
clean_df["Attrition"]
# Snake_Case replacement names for the columns of clean_df.
# The assignment below is positional, so this list has to stay in exactly
# the same order (and length) as clean_df's current columns.
header = [
    'Age',
    'Attrition',
    'Business_Travel',
    'Daily_Rate',
    'Department',
    'Distance_From_Home',
    'Education',
    'Education_Field',
    'Environment_Satisfaction',
    'Gender',
    'Hourly_Rate',
    'Job_Involvement',
    'Job_Level',
    'Job_Role',
    'Job_Satisfaction',
    'Marital_Status',
    'Monthly_Income',
    'Monthly_Rate',
    'Num_Companies_Worked',
    'Overtime',
    'Percent_Salary_Hike',
    'Performance_Rating',
    'Relationship_Satisfaction',
    'Stock_Option_Level',
    'Total_Working_Years',
    'Training_Times_Last_Year',
    'Work_Life_Balance',
    'Years_At_Company',
    'Years_In_Current_Role',
    'Years_Since_Last_Promotion',
    'Years_With_Current_Manager',
]
clean_df.columns = header

# Re-inspect the frame under its new column names.
clean_df.info()
clean_df.describe()
# Partition the employees on the encoded target:
# non-zero Attrition -> `attrition`, zero -> `no_attrition`.
attrition = clean_df.loc[clean_df['Attrition'].ne(0)]
no_attrition = clean_df.loc[clean_df['Attrition'].eq(0)]

# Total count of Yes/No 'Attrition', drawn as a horizontal bar chart.
trace = go.Bar(
    x=(len(attrition), len(no_attrition)),
    y=['Yes_attrition', 'No_attrition'],
    orientation='h',
    opacity=0.8,
    marker={
        'color': ['gold', 'lightskyblue'],
        'line': {'color': '#000000', 'width': 1.5},
    },
)
layout = {'title': 'Count of attrition variable'}
fig = {'data': [trace], 'layout': layout}
py.iplot(fig)
# Percentage of Yes/No 'Attrition'
# The counts are looked up by encoded value (0 = No, 1 = Yes) so that every
# count is paired with its label explicitly. value_counts() orders its result
# by frequency, so feeding it straight into `values` only matches the
# hard-coded label order while "No" happens to be the majority class.
attrition_counts = clean_df['Attrition'].value_counts().reindex([0, 1], fill_value=0)
trace = go.Pie(labels=['No_attrition', 'Yes_attrition'],
               values=attrition_counts.tolist(),
               textfont=dict(size=15), opacity=0.8,
               marker=dict(colors=['lightskyblue', 'gold'],
                           line=dict(color='#000000', width=1.5)))
layout = dict(title='Distribution of attrition variable')
fig = dict(data=[trace], layout=layout)
py.iplot(fig)
def plot_distribution(var_select, bin_size):
    """Plot histogram + KDE of one column, split by attrition group.

    Parameters
    ----------
    var_select : str
        Column of ``clean_df`` to plot.
    bin_size
        Passed unchanged to ``ff.create_distplot`` as ``bin_size``.

    The figure title also reports the correlation between ``Attrition`` and
    the selected column (``Series.corr`` with its default method), rounded
    to three decimals. The figure is shown with ``py.iplot``; nothing is
    returned.
    """
    target_corr = np.round(clean_df['Attrition'].corr(clean_df[var_select]), 3)

    # Leavers first, stayers second; labels and colours follow the same order.
    fig = ff.create_distplot(
        [attrition[var_select], no_attrition[var_select]],
        ['Yes_attrition', 'No_attrition'],
        colors=['#FFD700', '#7EC0EE'],
        show_hist=True,
        curve_type='kde',
        bin_size=bin_size,
    )
    fig['layout'].update(title=f"{var_select} (corr target ={target_corr!s})")
    py.iplot(fig, filename='Density plot')
def barplot(var_select, x_no_numeric):
    """Plot per-category counts for both attrition groups plus the attrition rate.

    Parameters
    ----------
    var_select : str
        Column of ``clean_df`` whose categories go on the x axis.
    x_no_numeric : bool
        When true, the attrition-rate table is sorted by the number of
        leavers (descending) before it is plotted; otherwise it keeps the
        order produced by ``pd.crosstab``.

    Two bar traces (count of leavers / stayers per category) share the left
    y axis; a scatter trace with the percentage of leavers per category is
    drawn against a secondary right-hand axis fixed to the range 0-75.
    The figure is shown with ``py.iplot``; nothing is returned.
    """
    left = clean_df[clean_df['Attrition'] != 0]
    stayed = clean_df[clean_df['Attrition'] == 0]

    # Rows: categories of var_select; columns 0 / 1: stayers / leavers.
    rate = pd.crosstab(clean_df[var_select], clean_df['Attrition'])
    rate['Attr%'] = rate[1] / (rate[1] + rate[0]) * 100
    if x_no_numeric:
        rate = rate.sort_values(1, ascending=False)

    # Count each group once and reuse the result for both x and y.
    left_counts = left[var_select].value_counts()
    stayed_counts = stayed[var_select].value_counts()

    trace1 = go.Bar(
        x=left_counts.index.tolist(),
        y=left_counts.values.tolist(),
        name='Yes_Attrition', opacity=0.8,
        marker=dict(color='gold',
                    line=dict(color='#000000', width=1)))

    trace2 = go.Bar(
        x=stayed_counts.index.tolist(),
        y=stayed_counts.values.tolist(),
        name='No_Attrition', opacity=0.8,
        marker=dict(color='lightskyblue',
                    line=dict(color='#000000', width=1)))

    trace3 = go.Scatter(
        x=rate.index,
        y=rate['Attr%'],
        yaxis='y2',
        name='% Attrition', opacity=0.6,
        marker=dict(color='black',
                    line=dict(color='#000000', width=0.5)))

    layout = dict(title=str(var_select),
                  xaxis=dict(),
                  yaxis=dict(title='Count'),
                  yaxis2=dict(range=[0, 75],
                              overlaying='y',
                              anchor='x',
                              side='right',
                              zeroline=False,
                              showgrid=False,
                              title='% Attrition'))

    fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
    py.iplot(fig)
# Numeric features, in display order:
# (column, bin_size for plot_distribution, also draw barplot?)
numeric_plot_specs = [
    ('Age', False, True),
    ('Daily_Rate', 100, False),
    ('Distance_From_Home', False, True),
    ('Hourly_Rate', False, False),
    ('Monthly_Income', 100, False),
    ('Monthly_Rate', 100, False),
    ('Num_Companies_Worked', False, True),
    ('Percent_Salary_Hike', False, True),
    ('Total_Working_Years', False, True),
    ('Training_Times_Last_Year', False, True),
    ('Years_At_Company', False, True),
    ('Years_In_Current_Role', False, True),
    ('Years_Since_Last_Promotion', False, True),
    ('Years_With_Current_Manager', False, True),
]

for numeric_column, numeric_bin_size, with_barplot in numeric_plot_specs:
    plot_distribution(numeric_column, numeric_bin_size)
    if with_barplot:
        # Numeric x axis: keep the crosstab order (no sorting by leavers).
        barplot(numeric_column, False)
def plot_pie(var_select):
    """Draw two side-by-side pies of one column: leavers (left) vs. stayers (right).

    Parameters
    ----------
    var_select : str
        Column present in both ``attrition`` and ``no_attrition``.

    Each category is given one colour that is used in BOTH pies. Applying the
    palette by position to each pie's own ``value_counts()`` order would give
    the same category different colours in the two pies whenever the
    frequency ranking differs between the groups, which makes the
    side-by-side comparison misleading.
    The figure is shown with ``py.iplot``; nothing is returned.
    """
    palette = ['gold', 'lightgreen', 'lightcoral', 'lightskyblue',
               'lightgrey', 'orange', 'white', 'lightpink']

    yes_counts = attrition[var_select].value_counts()
    no_counts = no_attrition[var_select].value_counts()
    yes_labels = yes_counts.index.tolist()
    no_labels = no_counts.index.tolist()

    # Categories in order of first appearance (leavers first, then any that
    # only occur among stayers). zip() stops at the end of the palette, so
    # categories beyond it get no explicit colour (None) -- presumably plotly
    # then falls back to its default colours, as it would for a colour list
    # shorter than the number of sectors; verify if a column has > 8 levels.
    color_of = dict(zip(dict.fromkeys(yes_labels + no_labels), palette))

    trace1 = go.Pie(values=yes_counts.values.tolist(),
                    labels=yes_labels,
                    textfont=dict(size=15), opacity=0.8,
                    hoverinfo="label+percent+name",
                    domain=dict(x=[0, .48]),
                    name="attrition employes",
                    marker=dict(colors=[color_of.get(label) for label in yes_labels],
                                line=dict(width=1.5)))

    trace2 = go.Pie(values=no_counts.values.tolist(),
                    labels=no_labels,
                    textfont=dict(size=15), opacity=0.8,
                    hoverinfo="label+percent+name",
                    marker=dict(colors=[color_of.get(label) for label in no_labels],
                                line=dict(width=1.5)),
                    domain=dict(x=[.52, 1]),
                    name="Non attrition employes")

    layout = go.Layout(dict(title=var_select + " distribution in employes attrition ",
                            annotations=[dict(text="Yes_attrition",
                                              font=dict(size=13),
                                              showarrow=False,
                                              x=.22, y=-0.1),
                                         dict(text="No_attrition",
                                              font=dict(size=13),
                                              showarrow=False,
                                              x=.8, y=-.1)]))

    fig = go.Figure(data=[trace1, trace2], layout=layout)
    py.iplot(fig)
# Categorical / ordinal features, in display order:
# (column, sort the rate table by number of leavers in barplot?)
# Text-valued columns are sorted; integer-coded scales keep their order.
categorical_plot_specs = [
    ('Gender', True),
    ('Overtime', True),
    ('Business_Travel', True),
    ('Job_Role', True),
    ('Department', True),
    ('Marital_Status', True),
    ('Education_Field', True),
    ('Education', False),
    ('Environment_Satisfaction', False),
    ('Job_Involvement', False),
    ('Job_Level', False),
    ('Job_Satisfaction', False),
    ('Performance_Rating', False),
    ('Relationship_Satisfaction', False),
    ('Stock_Option_Level', False),
    ('Work_Life_Balance', False),
]

for categorical_column, sort_by_leavers in categorical_plot_specs:
    plot_pie(categorical_column)
    barplot(categorical_column, sort_by_leavers)
# Per-education-level totals of every numeric column.
# numeric_only=True keeps the text columns (e.g. Department, Gender) out of
# the aggregation: summing them would concatenate strings, and averaging them
# raises a TypeError on current pandas versions.
Education_sum = clean_df.groupby(['Education']).sum(numeric_only=True)
Education_sum

# Mean of every numeric column per (Attrition, Education) combination.
Education = clean_df.groupby(['Attrition', 'Education']).mean(numeric_only=True)
Education

# Number of non-null rows per (Education, Attrition) combination.
clean_df.groupby(['Education', 'Attrition']).count()

# The two samples compared by the significance tests below.
attrition_df = clean_df[clean_df['Attrition'] == 1]
not_attrition_df = clean_df[clean_df['Attrition'] == 0]
attrition_df
not_attrition_df
# Education, leavers vs. stayers.
education_left = attrition_df['Education']
education_stayed = not_attrition_df['Education']

# Shapiro-Wilk normality check on each sample.
stats.shapiro(education_left)
stats.shapiro(education_stayed)

# t-test without the equal-variance assumption, then the Kruskal-Wallis H-test.
stats.ttest_ind(education_left, education_stayed, equal_var=False)
ss.kruskal(education_left, education_stayed)
p-value > 0.05
There appears to be no statistically significant relationship between Attrition and Education; thus we fail to reject the null hypothesis.
# Age, leavers vs. stayers.
age_left = attrition_df['Age']
age_stayed = not_attrition_df['Age']

# Shapiro-Wilk normality check on each sample.
stats.shapiro(age_left)
stats.shapiro(age_stayed)

# t-test without the equal-variance assumption, then the Kruskal-Wallis H-test.
stats.ttest_ind(age_left, age_stayed, equal_var=False)
ss.kruskal(age_left, age_stayed)
p-value < 0.05
There appears to be a statistically significant relationship between Attrition and Age; thus we can reject the null hypothesis.
# Job_Satisfaction, leavers vs. stayers.
job_satisfaction_left = attrition_df['Job_Satisfaction']
job_satisfaction_stayed = not_attrition_df['Job_Satisfaction']

# Shapiro-Wilk normality check on each sample.
stats.shapiro(job_satisfaction_left)
stats.shapiro(job_satisfaction_stayed)

# t-test without the equal-variance assumption, then the Kruskal-Wallis H-test.
stats.ttest_ind(job_satisfaction_left, job_satisfaction_stayed, equal_var=False)
ss.kruskal(job_satisfaction_left, job_satisfaction_stayed)
pvalue < 0.05
There appears to be a statistically significant relationship between Attrition and Job Satisfaction, thus we can reject the null hypothesis.
# Persist the cleaned frame. index=False keeps the automatically generated
# row index out of the file; otherwise it is written as an unnamed first
# column and comes back as a spurious extra column when the CSV is re-read.
clean_df.to_csv('clean_IBM_HR.csv', header=True, index=False)